{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 加载必要的库\n",
    "\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sklearn\n",
    "import random\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.7.3 (default, Apr 24 2020, 18:51:23) \n",
      "[Clang 11.0.3 (clang-1103.0.32.62)]\n"
     ]
    }
   ],
   "source": [
    "print(sys.version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn import feature_selection\n",
    "from sklearn import model_selection\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第一步：加载源数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_raw = pd.read_csv('train.csv') \n",
    "\n",
    "data_val = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 显示部分数据\n",
    "\n",
    "data_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>892</td>\n",
       "      <td>3</td>\n",
       "      <td>Kelly, Mr. James</td>\n",
       "      <td>male</td>\n",
       "      <td>34.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>330911</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>893</td>\n",
       "      <td>3</td>\n",
       "      <td>Wilkes, Mrs. James (Ellen Needs)</td>\n",
       "      <td>female</td>\n",
       "      <td>47.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>363272</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>894</td>\n",
       "      <td>2</td>\n",
       "      <td>Myles, Mr. Thomas Francis</td>\n",
       "      <td>male</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>240276</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>895</td>\n",
       "      <td>3</td>\n",
       "      <td>Wirz, Mr. Albert</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>315154</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>896</td>\n",
       "      <td>3</td>\n",
       "      <td>Hirvonen, Mrs. Alexander (Helga E Lindqvist)</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3101298</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Pclass                                          Name     Sex  \\\n",
       "0          892       3                              Kelly, Mr. James    male   \n",
       "1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   \n",
       "2          894       2                     Myles, Mr. Thomas Francis    male   \n",
       "3          895       3                              Wirz, Mr. Albert    male   \n",
       "4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   \n",
       "\n",
       "    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  \n",
       "0  34.5      0      0   330911   7.8292   NaN        Q  \n",
       "1  47.0      1      0   363272   7.0000   NaN        S  \n",
       "2  62.0      0      0   240276   9.6875   NaN        Q  \n",
       "3  27.0      0      0   315154   8.6625   NaN        S  \n",
       "4  22.0      1      1  3101298  12.2875   NaN        S  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_val.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  891 non-null    int64  \n",
      " 1   Survived     891 non-null    int64  \n",
      " 2   Pclass       891 non-null    int64  \n",
      " 3   Name         891 non-null    object \n",
      " 4   Sex          891 non-null    object \n",
      " 5   Age          714 non-null    float64\n",
      " 6   SibSp        891 non-null    int64  \n",
      " 7   Parch        891 non-null    int64  \n",
      " 8   Ticket       891 non-null    object \n",
      " 9   Fare         891 non-null    float64\n",
      " 10  Cabin        204 non-null    object \n",
      " 11  Embarked     889 non-null    object \n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data_raw.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 418 entries, 0 to 417\n",
      "Data columns (total 11 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  418 non-null    int64  \n",
      " 1   Pclass       418 non-null    int64  \n",
      " 2   Name         418 non-null    object \n",
      " 3   Sex          418 non-null    object \n",
      " 4   Age          332 non-null    float64\n",
      " 5   SibSp        418 non-null    int64  \n",
      " 6   Parch        418 non-null    int64  \n",
      " 7   Ticket       418 non-null    object \n",
      " 8   Fare         417 non-null    float64\n",
      " 9   Cabin        91 non-null     object \n",
      " 10  Embarked     418 non-null    object \n",
      "dtypes: float64(2), int64(4), object(5)\n",
      "memory usage: 36.0+ KB\n"
     ]
    }
   ],
   "source": [
    "data_val.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 列名称转换为小写格式\n",
    "\n",
    "data_raw.columns = data_raw.columns.str.lower() #  转换为小写\n",
    "\n",
    "data_val.columns = data_val.columns.str.lower() # 转换为小写"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>passengerid</th>\n",
       "      <th>survived</th>\n",
       "      <th>pclass</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>ticket</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   passengerid  survived  pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                name     sex   age  sibsp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   parch            ticket     fare cabin embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x125831668>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEGCAYAAACKB4k+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAPQUlEQVR4nO3dfbDmZV3H8fcHFqR84MHdNtyllpLJoRTFE5HaVJAFZC5jgjgaK+7M1gw1OmZG/ZEPQ42OlmEatRPqQiUgZmxmGrNApgPq2UQeMzeC2A3cI0+KZLn27Y9z7cVhObvcZ9nfuc9y3q+Ze+7rd/2u3+/+3szO+XD9nu5UFZIkARww7gIkSQuHoSBJ6gwFSVJnKEiSOkNBktQtGXcBT8TSpUtr1apV4y5DkvYrmzdv/npVLZtt3X4dCqtWrWJycnLcZUjSfiXJnbtb5+EjSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUrdf39G8L7zwty4edwlagDa/++xxlyCNhTMFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkbNBSS3JHkpiQ3JJlsfUckuSrJV9v74a0/Sd6XZEuSG5McP2RtkqTHmo+Zws9W1fOraqItnwdsqqpjgE1tGeBU4Jj2WgdcOA+1SZJmGMfho9XAhtbeAJw+o//imnY9cFiSI8dQnyQtWkOHQgH/mGRzknWtb3lV3d3a9wDLW3sFcNeMbbe2vkdJsi7JZJLJqampoeqWpEVp6J/jfElVbUvyfcBVSf515sqqqiQ1lx1W1XpgPcDExMSctpUk7dmgM4Wq2tbetwMfB04AvrbzsFB7396GbwOOmrH5ytYnSZong4VCkqcmefrONvDzwM3ARmBNG7YGuLK1NwJnt6uQTgQenHGYSZI0D4Y8fLQc+HiSnZ/z11X1qSRfBC5Psha4Ezizjf8kcBqwBXgYOGfA2iRJsxgsFKrqduC4WfrvBU6epb+Ac4eqR5L0+LyjWZLUGQqSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkzFCRJnaEgSeoMBUlSZyhIkjpDQZLUGQqSpM5QkCR1hoIkqTMUJEnd4KGQ5MAkX0ryibZ8dJLPJ9mS5LIkB7f+p7TlLW39qqFrkyQ92nzMFN4A3DZj+V3Ae6vq2cD9wNrWvxa4v/W/t42TJM2jQUMhyUrgF4G/aMsBTgKuaEM2AKe39uq2TFt/chsvSZonQ88U/hh4C/B/bfmZwANVtaMtbwVWtPYK4C6Atv7BNv5RkqxLMplkcmpqasjaJWnRGSwUkrwM2F5Vm/flfqtqfVVNVNXEsmXL9uWuJWnRWzLgvl8MvDzJacAhwDOAC4DDkixps4GVwLY2fhtwFLA1yRLgUODeAeuTJO1isJlCVf1OVa2sqlXAWcDVVfUa4BrglW3YGuDK1t7Ylmnrr66qGqo+SdJjjeM+hd8G3pRkC9PnDC5q/RcBz2z9bwLOG0NtkrSoDXn4qKuqa4FrW/t24IRZxnwbOGM+6pEkzc47miVJnaEgSeoMBUlSZyhIkjpDQZLUGQqSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpm5cf2ZE0d//5jueOuwQtQD/wezcNun9nCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1I0UCkk2jdInSdq/7fGO5iSHAN8LLE1yOJC26hnAioFrkyTNs8d7zMWvAm8EngVs5pFQ+Abw/gHrkiSNwR4PH1XVBVV1NPDmqvqhqjq6vY6rqj2GQpJDknwhyZeT3JLk7a3/6CSfT7IlyWVJDm79T2nLW9r6VfvoO0qSRjTSA/Gq6k+SvAhYNXObqrp4D5v9D3BSVT2U5CDgs0n+AXgT8N6qujTJnwFrgQvb+/1V9ewkZwHvAl61N19KkrR3Rj3RfAnwHuAlwI+318SetqlpD7XFg9qrgJOAK1r/BuD01l7dlmnrT06y83CVJGkejPro7Ang2Kqquew8yYFMn4t4NvAB4N+BB6pqRxuylUdOWK8A7gKoqh1JHgSeCXx9Lp8pSdp7o96ncDPw/XPdeVV9t6qeD6wETgCeM9d97CrJuiSTSSanpqae6O4kSTOMOlNYCtya5AtMnysAoKpePsrGVfVAkmuAnwQOS7KkzRZWAtvasG3AUcDWJEuAQ4F7Z9nXemA9wMTExJxmLpKkPRs1FN421x0nWQZ8pwXC9wAvZfrk8TXAK4FLgTXAlW2TjW35urb+6rkerpIkPTGjXn30T3ux7yOBDe28wgHA5VX1iSS3ApcmOR/4EnBRG38RcEmSLcB9wFl78ZmSpCdgpFBI8k2mrxwCOJjpK4m+VVXP2N02VXUj8IJZ+m9n+vzCrv3fBs4YpR5J0jBGnSk8fWe7XSa6GjhxqKIkSeMx56ektvsP/hb4hQHqkSSN0aiHj14xY/EApu9b+PYgFUmSxmbUq49+aUZ7B3AH04eQJElPIqOeUzhn6EIkSeM36rOPVib5eJLt7fWxJCuHLk6SNL9GPdH8IaZvLntWe/1d65MkPYmMGgrLqupDVbWjvT4MLBuwLknSGIwaCvcmeW2SA9vrtczyXCJJ0v5t1FB4PXAmcA9wN9PPJnrdQDVJksZk1EtS3wGsqar7AZIcwfSP7rx+qMIkSfNv1JnC83YGAkBV3ccszzWSJO3fRg2FA5IcvnOhzRRGnWVIkvYTo/5h/0PguiQfbctnAL8/TEmSpHEZ9Y7mi5NMAie1rldU1a3DlSVJGoeRDwG1EDAIJOlJbM6PzpYkPXkZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkzFCRJ3WChkOSoJNckuTXJLUne0PqPSHJVkq+298Nbf5K8L8mWJDcmOX6o2iRJsxtyprAD+M2qOhY4ETg3ybHAecCmqjoG2NSWAU4FjmmvdcCFA9YmSZrFYKFQVXdX1b+09jeB24AVwGpgQxu2ATi9tVcDF9e064HDkhw5VH2SpMeal3MKSVYBLwA+DyyvqrvbqnuA5a29ArhrxmZbW9+u+1qXZDLJ5NTU1GA1S9JiNHgoJHka8DHgjVX1jZnrqqqAmsv+qmp9VU1U1cSyZcv2YaWSpEFDIclBTAfCX1XV37Tur+08LNTet7f+bcBRMzZf2fokSfNkyKuPAlwE3FZVfzRj1UZgTWuvAa6c0X92uwrpRODBGYeZJEnzYMmA+34x8CvATUluaH2/C7wTuDzJWuBO4My27pPAacAW4GHgnAFrkyTNYrBQqKrPAtnN6pNnGV/AuUPVI0l6fN7RLEnqDAVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkzFCRJnaEgSeoMBUlSZyhIkjpDQZLUGQqSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqRusFBI8sEk25PcPKPviCRXJflqez+89SfJ+5JsSXJjkuOHqkuStHtDzhQ+DJyyS995wKaqOgbY1JYBTgWOaa91wIUD1iVJ2o3BQqGqPgPct0v3amBDa28ATp/Rf3FNux44LMmRQ9UmSZrdfJ9TWF5Vd7f2PcDy1l4B3DVj3NbW9xhJ1iWZTDI5NTU1XKWStAiN7URzVRVQe7Hd+qqaqKqJZcuWDVCZJC1e8x0KX9t5WKi9b2/924CjZoxb2fokSfNovkNhI7CmtdcAV87oP7tdhXQi8OCMw0ySpHmyZKgdJ/kI8DPA0iRbgbcC7wQuT7IWuBM4sw3/JHAasAV4GDhnqLokSbs3WChU1at3s+rkWcYWcO5QtUiSRuMdzZKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkzFCRJnaEgSeoMBUlSZyhIkjpDQZLUGQqSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqFlQoJDklyVeSbEly3rjrkaTFZsGEQpIDgQ8ApwLHAq9Ocux4q5KkxWXBhAJwArClqm6vqv8FLgVWj7kmSVpUloy7gBlWAHfNWN4K/MSug5KsA9a1xYeSfGUealsslgJfH3cRC0Hes2bcJejR/Le501uzL/byg7tbsZBCYSRVtR5YP+46noySTFbVxLjrkHblv835s5AOH20DjpqxvLL1SZLmyUIKhS8CxyQ5OsnBwFnAxjHXJEmLyoI5fFRVO5L8OvBp4EDgg1V1y5jLWmw8LKeFyn+b8yRVNe4aJEkLxEI6fCRJGjNDQZLUGQry8SJasJJ8MMn2JDePu5bFwlBY5Hy8iBa4DwOnjLuIxcRQkI8X0YJVVZ8B7ht3HYuJoaDZHi+yYky1SBozQ0GS1BkK8vEikjpDQT5eRFJnKCxyVbUD2Pl4kduAy328iBaKJB8BrgN+JMnWJGvHXdOTnY+5kCR1zhQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkK0kCSvHxfPXU2yUP7Yj/S4/GSVOkJSLKk3esx9Oc8VFVPG/pzJGcKEpDkqUn+PsmXk9yc5FVJ7kiytK2fSHJta78tySVJPgdckuT6JD86Y1/XtvGvS/L+JIcmuTPJATM+664kByX54SSfSrI5yT8neU4bc3SS65LclOT8+f8vosXKUJCmnQL8V1UdV1U/BnzqccYfC/xcVb0auAw4EyDJkcCRVTW5c2BVPQjcAPx063oZ8Omq+g7TP0j/G1X1QuDNwJ+2MRcAF1bVc4G798UXlEZhKEjTbgJemuRdSX6q/SHfk41V9d+tfTnwytY+E7hilvGXAa9q7bOAy5I8DXgR8NEkNwB/DhzZxrwY+EhrXzLnbyPtpSXjLkBaCKrq35IcD5wGnJ9kE7CDR/7H6ZBdNvnWjG23Jbk3yfOY/sP/a7N8xEbgD5IcAbwQuBp4KvBAVT1/d2Xt9ReS9pIzBQlI8izg4ar6S+DdwPHAHUz/AQf45cfZxWXAW4BDq+rGXVdW1UNMP5H2AuATVfXdqvoG8B9Jzmg1JMlxbZPPMT2jAHjNXn8xaY4MBWnac4EvtMM4bwXOB94OXJBkEvju42x/BdN/xC/fw5jLgNe2951eA6xN8mXgFh75KdQ3AOcmuQl/CU/zyEtSJUmdMwVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkzFCRJ3f8DThe6X9gR+9IAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制图形\n",
    "\n",
    "sns.countplot(data_raw['survived'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 合并两个数据集，进行统一的清洗\n",
    "\n",
    "data_all = [data_raw, data_val]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "passengerid      0\n",
       "survived         0\n",
       "pclass           0\n",
       "name             0\n",
       "sex              0\n",
       "age            177\n",
       "sibsp            0\n",
       "parch            0\n",
       "ticket           0\n",
       "fare             0\n",
       "cabin          687\n",
       "embarked         2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw.isnull().sum() # 查看训练集中的空值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "passengerid      0\n",
       "pclass           0\n",
       "name             0\n",
       "sex              0\n",
       "age             86\n",
       "sibsp            0\n",
       "parch            0\n",
       "ticket           0\n",
       "fare             1\n",
       "cabin          327\n",
       "embarked         0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_val.isnull().sum() # 查看验证集是否有空值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>passengerid</th>\n",
       "      <th>survived</th>\n",
       "      <th>pclass</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>ticket</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891</td>\n",
       "      <td>891</td>\n",
       "      <td>714.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>204</td>\n",
       "      <td>889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>891</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>681</td>\n",
       "      <td>NaN</td>\n",
       "      <td>147</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Stephenson, Mrs. Walter Bertram (Martha Eustis)</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1601</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C23 C25 C27</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>577</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>644</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>446.000000</td>\n",
       "      <td>0.383838</td>\n",
       "      <td>2.308642</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.699118</td>\n",
       "      <td>0.523008</td>\n",
       "      <td>0.381594</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.204208</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>257.353842</td>\n",
       "      <td>0.486592</td>\n",
       "      <td>0.836071</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.526497</td>\n",
       "      <td>1.102743</td>\n",
       "      <td>0.806057</td>\n",
       "      <td>NaN</td>\n",
       "      <td>49.693429</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.420000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>223.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.910400</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>446.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>28.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.454200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>668.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>891.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>512.329200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        passengerid    survived      pclass  \\\n",
       "count    891.000000  891.000000  891.000000   \n",
       "unique          NaN         NaN         NaN   \n",
       "top             NaN         NaN         NaN   \n",
       "freq            NaN         NaN         NaN   \n",
       "mean     446.000000    0.383838    2.308642   \n",
       "std      257.353842    0.486592    0.836071   \n",
       "min        1.000000    0.000000    1.000000   \n",
       "25%      223.500000    0.000000    2.000000   \n",
       "50%      446.000000    0.000000    3.000000   \n",
       "75%      668.500000    1.000000    3.000000   \n",
       "max      891.000000    1.000000    3.000000   \n",
       "\n",
       "                                                   name   sex         age  \\\n",
       "count                                               891   891  714.000000   \n",
       "unique                                              891     2         NaN   \n",
       "top     Stephenson, Mrs. Walter Bertram (Martha Eustis)  male         NaN   \n",
       "freq                                                  1   577         NaN   \n",
       "mean                                                NaN   NaN   29.699118   \n",
       "std                                                 NaN   NaN   14.526497   \n",
       "min                                                 NaN   NaN    0.420000   \n",
       "25%                                                 NaN   NaN   20.125000   \n",
       "50%                                                 NaN   NaN   28.000000   \n",
       "75%                                                 NaN   NaN   38.000000   \n",
       "max                                                 NaN   NaN   80.000000   \n",
       "\n",
       "             sibsp       parch ticket        fare        cabin embarked  \n",
       "count   891.000000  891.000000    891  891.000000          204      889  \n",
       "unique         NaN         NaN    681         NaN          147        3  \n",
       "top            NaN         NaN   1601         NaN  C23 C25 C27        S  \n",
       "freq           NaN         NaN      7         NaN            4      644  \n",
       "mean      0.523008    0.381594    NaN   32.204208          NaN      NaN  \n",
       "std       1.102743    0.806057    NaN   49.693429          NaN      NaN  \n",
       "min       0.000000    0.000000    NaN    0.000000          NaN      NaN  \n",
       "25%       0.000000    0.000000    NaN    7.910400          NaN      NaN  \n",
       "50%       0.000000    0.000000    NaN   14.454200          NaN      NaN  \n",
       "75%       1.000000    0.000000    NaN   31.000000          NaN      NaN  \n",
       "max       8.000000    6.000000    NaN  512.329200          NaN      NaN  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 对源数据集进行描述\n",
    "\n",
    "data_raw.describe(include='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对原始数据集（训练集+验证集）进行清理\n",
    "\n",
    "for dataset in data_all:\n",
    "    # 补足空缺值\n",
    "    dataset['age'].fillna(dataset['age'].median(), inplace=True)\n",
    "    dataset['fare'].fillna(dataset['fare'].median(), inplace=True)\n",
    "    dataset['embarked'].fillna(dataset['embarked'].mode()[0], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 删除一些字段\n",
    "drop_columns = ['cabin', 'passengerid', 'ticket']\n",
    "\n",
    "data_raw.drop(drop_columns, axis=1, inplace=True)\n",
    "\n",
    "data_val.drop(drop_columns, axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "survived    0\n",
       "pclass      0\n",
       "name        0\n",
       "sex         0\n",
       "age         0\n",
       "sibsp       0\n",
       "parch       0\n",
       "fare        0\n",
       "embarked    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw.isnull().sum() # 训练集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pclass      0\n",
       "name        0\n",
       "sex         0\n",
       "age         0\n",
       "sibsp       0\n",
       "parch       0\n",
       "fare        0\n",
       "embarked    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_val.isnull().sum() # 验证集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第三步：进行特征构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tommy/Torch/Torch/lib/python3.7/site-packages/pandas/core/indexing.py:671: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n"
     ]
    }
   ],
   "source": [
    "for dataset in data_all:\n",
    "    # 构建新的字段：\n",
    "    # （1）family_size 家庭规模：sibsp + parch\n",
    "    dataset['family_size'] = dataset['sibsp'] + dataset['parch'] + 1\n",
    "    # （2）单身 single ,1 : 单身， 0 ：非单身\n",
    "    dataset['single'] = 1 \n",
    "    dataset['single'].loc[dataset['family_size'] > 1] = 0 # 0 ：不是单身\n",
    "    # （3）身份 title\n",
    "    dataset['title'] = dataset['name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]\n",
    "    # dataset['title'] = dataset['name'].apply(lambda x : x.split(,)[1]).apply(lambda x : x.split('.')[0])\n",
    "    # （4） 票价 fare_bin\n",
    "    dataset['fare_bin'] = pd.qcut(dataset['fare'], 4) # 根据票价，分成4组（每组的元素个数一致）\n",
    "    # （5）年龄 age_bin\n",
    "    dataset['age_bin'] = pd.cut(dataset['age'].astype(int), 5) # 根据年龄分组，分成5组（每组的元素不一致）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>embarked</th>\n",
       "      <th>family_size</th>\n",
       "      <th>single</th>\n",
       "      <th>title</th>\n",
       "      <th>fare_bin</th>\n",
       "      <th>age_bin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>Kelly, Mr. James</td>\n",
       "      <td>male</td>\n",
       "      <td>34.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>Q</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(-0.001, 7.896]</td>\n",
       "      <td>(30.4, 45.6]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>Wilkes, Mrs. James (Ellen Needs)</td>\n",
       "      <td>female</td>\n",
       "      <td>47.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>(-0.001, 7.896]</td>\n",
       "      <td>(45.6, 60.8]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Myles, Mr. Thomas Francis</td>\n",
       "      <td>male</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>Q</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(7.896, 14.454]</td>\n",
       "      <td>(60.8, 76.0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Wirz, Mr. Albert</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(7.896, 14.454]</td>\n",
       "      <td>(15.2, 30.4]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>Hirvonen, Mrs. Alexander (Helga E Lindqvist)</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>S</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>(7.896, 14.454]</td>\n",
       "      <td>(15.2, 30.4]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pclass                                          name     sex   age  sibsp  \\\n",
       "0       3                              Kelly, Mr. James    male  34.5      0   \n",
       "1       3              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1   \n",
       "2       2                     Myles, Mr. Thomas Francis    male  62.0      0   \n",
       "3       3                              Wirz, Mr. Albert    male  27.0      0   \n",
       "4       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1   \n",
       "\n",
       "   parch     fare embarked  family_size  single title         fare_bin  \\\n",
       "0      0   7.8292        Q            1       1    Mr  (-0.001, 7.896]   \n",
       "1      0   7.0000        S            2       0   Mrs  (-0.001, 7.896]   \n",
       "2      0   9.6875        Q            1       1    Mr  (7.896, 14.454]   \n",
       "3      0   8.6625        S            1       1    Mr  (7.896, 14.454]   \n",
       "4      1  12.2875        S            3       0   Mrs  (7.896, 14.454]   \n",
       "\n",
       "        age_bin  \n",
       "0  (30.4, 45.6]  \n",
       "1  (45.6, 60.8]  \n",
       "2  (60.8, 76.0]  \n",
       "3  (15.2, 30.4]  \n",
       "4  (15.2, 30.4]  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Mr              517\n",
       "Miss            182\n",
       "Mrs             125\n",
       "Master           40\n",
       "Dr                7\n",
       "Rev               6\n",
       "Major             2\n",
       "Col               2\n",
       "Mlle              2\n",
       "Mme               1\n",
       "Jonkheer          1\n",
       "Don               1\n",
       "Lady              1\n",
       "Ms                1\n",
       "the Countess      1\n",
       "Sir               1\n",
       "Capt              1\n",
       "Name: title, dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 根据title统计人数\n",
    "\n",
    "data_raw['title'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "title_names = (data_raw['title'].value_counts() < 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Mr              False\n",
       "Miss            False\n",
       "Mrs             False\n",
       "Master          False\n",
       "Dr               True\n",
       "Rev              True\n",
       "Major            True\n",
       "Col              True\n",
       "Mlle             True\n",
       "Mme              True\n",
       "Jonkheer         True\n",
       "Don              True\n",
       "Lady             True\n",
       "Ms               True\n",
       "the Countess     True\n",
       "Sir              True\n",
       "Capt             True\n",
       "Name: title, dtype: bool"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "#  title ：将那些称谓所在的人数小于10的数据，全部归为一类：other\n",
    "data_raw['title'] = data_raw['title'].apply(lambda x : 'other' if title_names[x] else x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Mr        517\n",
       "Miss      182\n",
       "Mrs       125\n",
       "Master     40\n",
       "other      27\n",
       "Name: title, dtype: int64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw['title'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "title\n",
       "Master    0.575000\n",
       "Miss      0.697802\n",
       "Mr        0.156673\n",
       "Mrs       0.792000\n",
       "other     0.444444\n",
       "Name: survived, dtype: float64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw['survived'].groupby(data_raw['title']).mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### （二）构建新的字段，基于scikit-learn中的LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>survived</th>\n",
       "      <th>pclass</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>embarked</th>\n",
       "      <th>family_size</th>\n",
       "      <th>single</th>\n",
       "      <th>title</th>\n",
       "      <th>fare_bin</th>\n",
       "      <th>age_bin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(-0.001, 7.91]</td>\n",
       "      <td>(16.0, 32.0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>(31.0, 512.329]</td>\n",
       "      <td>(32.0, 48.0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Miss</td>\n",
       "      <td>(7.91, 14.454]</td>\n",
       "      <td>(16.0, 32.0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>(31.0, 512.329]</td>\n",
       "      <td>(32.0, 48.0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(7.91, 14.454]</td>\n",
       "      <td>(32.0, 48.0]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   survived  pclass                                               name  \\\n",
       "0         0       3                            Braund, Mr. Owen Harris   \n",
       "1         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2         1       3                             Heikkinen, Miss. Laina   \n",
       "3         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4         0       3                           Allen, Mr. William Henry   \n",
       "\n",
       "      sex   age  sibsp  parch     fare embarked  family_size  single title  \\\n",
       "0    male  22.0      1      0   7.2500        S            2       0    Mr   \n",
       "1  female  38.0      1      0  71.2833        C            2       0   Mrs   \n",
       "2  female  26.0      0      0   7.9250        S            1       1  Miss   \n",
       "3  female  35.0      1      0  53.1000        S            2       0   Mrs   \n",
       "4    male  35.0      0      0   8.0500        S            1       1    Mr   \n",
       "\n",
       "          fare_bin       age_bin  \n",
       "0   (-0.001, 7.91]  (16.0, 32.0]  \n",
       "1  (31.0, 512.329]  (32.0, 48.0]  \n",
       "2   (7.91, 14.454]  (16.0, 32.0]  \n",
       "3  (31.0, 512.329]  (32.0, 48.0]  \n",
       "4   (7.91, 14.454]  (32.0, 48.0]  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "label = LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset in data_all:\n",
    "    # （1）新字段：sex_code\n",
    "    dataset['sex_code'] = label.fit_transform(dataset['sex'])\n",
    "    # （2）新字段：embarked_code\n",
    "    dataset['embarked_code'] = label.fit_transform(dataset['embarked'])\n",
    "    # （3）新字段：title_code\n",
    "    dataset['title_code'] = label.fit_transform(dataset['title'])\n",
    "    # （4）新字段：age_bin_code\n",
    "    dataset['age_bin_code'] = label.fit_transform(dataset['age_bin'])\n",
    "    # （5）新字段：fare_bin_code\n",
    "    dataset['fare_bin_code'] = label.fit_transform(dataset['fare_bin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>survived</th>\n",
       "      <th>pclass</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>embarked</th>\n",
       "      <th>family_size</th>\n",
       "      <th>single</th>\n",
       "      <th>title</th>\n",
       "      <th>fare_bin</th>\n",
       "      <th>age_bin</th>\n",
       "      <th>sex_code</th>\n",
       "      <th>embarked_code</th>\n",
       "      <th>title_code</th>\n",
       "      <th>age_bin_code</th>\n",
       "      <th>fare_bin_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(-0.001, 7.91]</td>\n",
       "      <td>(16.0, 32.0]</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>(31.0, 512.329]</td>\n",
       "      <td>(32.0, 48.0]</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Miss</td>\n",
       "      <td>(7.91, 14.454]</td>\n",
       "      <td>(16.0, 32.0]</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>(31.0, 512.329]</td>\n",
       "      <td>(32.0, 48.0]</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Mr</td>\n",
       "      <td>(7.91, 14.454]</td>\n",
       "      <td>(32.0, 48.0]</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   survived  pclass                                               name  \\\n",
       "0         0       3                            Braund, Mr. Owen Harris   \n",
       "1         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2         1       3                             Heikkinen, Miss. Laina   \n",
       "3         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4         0       3                           Allen, Mr. William Henry   \n",
       "\n",
       "      sex   age  sibsp  parch     fare embarked  family_size  single title  \\\n",
       "0    male  22.0      1      0   7.2500        S            2       0    Mr   \n",
       "1  female  38.0      1      0  71.2833        C            2       0   Mrs   \n",
       "2  female  26.0      0      0   7.9250        S            1       1  Miss   \n",
       "3  female  35.0      1      0  53.1000        S            2       0   Mrs   \n",
       "4    male  35.0      0      0   8.0500        S            1       1    Mr   \n",
       "\n",
       "          fare_bin       age_bin  sex_code  embarked_code  title_code  \\\n",
       "0   (-0.001, 7.91]  (16.0, 32.0]         1              2           2   \n",
       "1  (31.0, 512.329]  (32.0, 48.0]         0              0           3   \n",
       "2   (7.91, 14.454]  (16.0, 32.0]         0              2           1   \n",
       "3  (31.0, 512.329]  (32.0, 48.0]         0              2           3   \n",
       "4   (7.91, 14.454]  (32.0, 48.0]         1              2           2   \n",
       "\n",
       "   age_bin_code  fare_bin_code  \n",
       "0             1              0  \n",
       "1             2              3  \n",
       "2             1              1  \n",
       "3             2              3  \n",
       "4             2              1  "
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['survived',\n",
       " 'pclass',\n",
       " 'name',\n",
       " 'sex',\n",
       " 'age',\n",
       " 'sibsp',\n",
       " 'parch',\n",
       " 'fare',\n",
       " 'embarked',\n",
       " 'family_size',\n",
       " 'single',\n",
       " 'title',\n",
       " 'fare_bin',\n",
       " 'age_bin',\n",
       " 'sex_code',\n",
       " 'embarked_code',\n",
       " 'title_code',\n",
       " 'age_bin_code',\n",
       " 'fare_bin_code']"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 列的名称\n",
    "\n",
    "data_raw.columns.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方式一：特征选择"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "Target = ['survived'] # 标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_columns_one = ['sex', 'pclass', 'embarked', 'title', 'sibsp', 'parch', 'age', 'fare', 'family_size', \n",
    "                    'single']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_one = Target + data_columns_one"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方式二：特征选择"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_columns_two = ['sex_code', 'pclass', 'embarked_code', 'title_code', 'sibsp', 'parch', 'age', 'fare']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_two = Target + data_columns_two"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方式三：特征选择"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_columns_three = ['sex_code', 'pclass', 'embarked_code', 'title_code', 'family_size', 'age_bin_code', 'fare_bin_code']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_three = Target + data_columns_three"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过Pandas中的get_dummies() 进行编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_one_dummy = pd.get_dummies(data_raw[data_columns_one])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_one_dummy_list = data_one_dummy.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data_one_dummy_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 获取训练集和测试集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方式一：训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_one, X_test_one, y_train_one, y_test_one = model_selection.train_test_split(data_one_dummy[data_one_dummy_list],\n",
    "                                                                    data_raw[Target],\n",
    "                                                                    random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(668, 17)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_one.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(223, 17)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_one.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方式二：训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_two, X_test_two, y_train_two, y_test_tewo = model_selection.train_test_split(data_raw[data_columns_two],\n",
    "                                                                                     data_raw[Target],\n",
    "                                                                                     random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(668, 8)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_two.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(223, 8)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_two.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方式三：训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_three, X_test_three, y_train_three, y_test_three = model_selection.train_test_split(data_raw[data_columns_three],\n",
    "                                                                                            data_raw[Target],\n",
    "                                                                                            random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(668, 7)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_three.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 随机森林算法实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "rf = RandomForestClassifier(max_features='auto',\n",
    "                            random_state=1, \n",
    "                            n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_gird = {\n",
    "    'criterion' : ['gini', 'entropy'],\n",
    "    'min_samples_leaf' : [1, 5, 10],\n",
    "    'min_samples_split' : [2, 4, 10, 12, 16],\n",
    "    'n_estimators' : [50, 100, 400, 700, 1000]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "gs = GridSearchCV(estimator=rf,\n",
    "                  param_grid=param_gird,\n",
    "                  scoring= gyi'accuracy',\n",
    "                  cv=3,\n",
    "                  n_jobs=-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (1) 对特征一进行训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tommy/Torch/Torch/lib/python3.7/site-packages/sklearn/model_selection/_search.py:765: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  self.best_estimator_.fit(X, y, **fit_params)\n"
     ]
    }
   ],
   "source": [
    "gs = gs.fit(X_train_one, y_train_one)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8323839534601868\n"
     ]
    }
   ],
   "source": [
    "print(gs.best_score_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'criterion': 'entropy', 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 50}\n"
     ]
    }
   ],
   "source": [
    "print(gs.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 创建一个对象\n",
    "\n",
    "rf2 = RandomForestClassifier(criterion='entropy',\n",
    "                             min_samples_leaf=5,\n",
    "                             min_samples_split=12,\n",
    "                             n_estimators=50,\n",
    "                             n_jobs=-1,\n",
    "                             random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tommy/Torch/Torch/lib/python3.7/site-packages/ipykernel_launcher.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(criterion='entropy', min_samples_leaf=5,\n",
       "                       min_samples_split=12, n_estimators=50, n_jobs=-1,\n",
       "                       random_state=1)"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rf2.fit(X_train_one, y_train_one)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>age</th>\n",
       "      <th>fare</th>\n",
       "      <th>family_size</th>\n",
       "      <th>single</th>\n",
       "      <th>sex_female</th>\n",
       "      <th>sex_male</th>\n",
       "      <th>embarked_C</th>\n",
       "      <th>embarked_Q</th>\n",
       "      <th>embarked_S</th>\n",
       "      <th>title_Master</th>\n",
       "      <th>title_Miss</th>\n",
       "      <th>title_Mr</th>\n",
       "      <th>title_Mrs</th>\n",
       "      <th>title_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>17.0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>253</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>16.1000</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>706</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>45.0</td>\n",
       "      <td>13.5000</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     pclass  sibsp  parch   age     fare  family_size  single  sex_female  \\\n",
       "105       3      0      0  28.0   7.8958            1       1           0   \n",
       "68        3      4      2  17.0   7.9250            7       0           1   \n",
       "253       3      1      0  30.0  16.1000            2       0           0   \n",
       "320       3      0      0  22.0   7.2500            1       1           0   \n",
       "706       2      0      0  45.0  13.5000            1       1           1   \n",
       "\n",
       "     sex_male  embarked_C  embarked_Q  embarked_S  title_Master  title_Miss  \\\n",
       "105         1           0           0           1             0           0   \n",
       "68          0           0           0           1             0           1   \n",
       "253         1           0           0           1             0           0   \n",
       "320         1           0           0           1             0           0   \n",
       "706         0           0           0           1             0           0   \n",
       "\n",
       "     title_Mr  title_Mrs  title_other  \n",
       "105         1          0            0  \n",
       "68          0          0            0  \n",
       "253         1          0            0  \n",
       "320         1          0            0  \n",
       "706         0          1            0  "
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_one.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Variable</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>sex_male</td>\n",
       "      <td>0.158834</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>title_Mrs</td>\n",
       "      <td>0.150353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>family_size</td>\n",
       "      <td>0.138901</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>embarked_C</td>\n",
       "      <td>0.123423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>fare</td>\n",
       "      <td>0.108769</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sibsp</td>\n",
       "      <td>0.092098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>single</td>\n",
       "      <td>0.053961</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>parch</td>\n",
       "      <td>0.042367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>title_other</td>\n",
       "      <td>0.034270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>age</td>\n",
       "      <td>0.019245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>title_Mr</td>\n",
       "      <td>0.018788</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>title_Master</td>\n",
       "      <td>0.013603</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>embarked_Q</td>\n",
       "      <td>0.011775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>sex_female</td>\n",
       "      <td>0.010531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>embarked_S</td>\n",
       "      <td>0.010134</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>title_Miss</td>\n",
       "      <td>0.010098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.002848</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Variable  importance\n",
       "7       sex_male    0.158834\n",
       "14     title_Mrs    0.150353\n",
       "4    family_size    0.138901\n",
       "8     embarked_C    0.123423\n",
       "3           fare    0.108769\n",
       "0          sibsp    0.092098\n",
       "5         single    0.053961\n",
       "1          parch    0.042367\n",
       "15   title_other    0.034270\n",
       "2            age    0.019245\n",
       "13      title_Mr    0.018788\n",
       "11  title_Master    0.013603\n",
       "9     embarked_Q    0.011775\n",
       "6     sex_female    0.010531\n",
       "10    embarked_S    0.010134\n",
       "12    title_Miss    0.010098\n",
       "16           NaN    0.002848"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 根据特征的重要性排序\n",
    "pd.concat((pd.DataFrame(X_train_one.iloc[:, 1:].columns, columns=['Variable']),\n",
    "           pd.DataFrame(rf2.feature_importances_, columns=['importance'])),\n",
    "           axis=1).sort_values(by='importance', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 在test上进行预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred = rf2.predict(X_test_one)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_df = pd.DataFrame(pred, columns=['survived'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   survived\n",
       "0         0\n",
       "1         0\n",
       "2         0\n",
       "3         1\n",
       "4         1"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## !!! 注意：在最终的test.csv上预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_val_dummy = pd.get_dummies(data_val[data_columns_one])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>age</th>\n",
       "      <th>fare</th>\n",
       "      <th>family_size</th>\n",
       "      <th>single</th>\n",
       "      <th>sex_female</th>\n",
       "      <th>sex_male</th>\n",
       "      <th>embarked_C</th>\n",
       "      <th>...</th>\n",
       "      <th>embarked_S</th>\n",
       "      <th>title_Col</th>\n",
       "      <th>title_Dona</th>\n",
       "      <th>title_Dr</th>\n",
       "      <th>title_Master</th>\n",
       "      <th>title_Miss</th>\n",
       "      <th>title_Mr</th>\n",
       "      <th>title_Mrs</th>\n",
       "      <th>title_Ms</th>\n",
       "      <th>title_Rev</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>34.5</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>22.0</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   pclass  sibsp  parch   age     fare  family_size  single  sex_female  \\\n",
       "0       3      0      0  34.5   7.8292            1       1           0   \n",
       "1       3      1      0  47.0   7.0000            2       0           1   \n",
       "2       2      0      0  62.0   9.6875            1       1           0   \n",
       "3       3      0      0  27.0   8.6625            1       1           0   \n",
       "4       3      1      1  22.0  12.2875            3       0           1   \n",
       "\n",
       "   sex_male  embarked_C  ...  embarked_S  title_Col  title_Dona  title_Dr  \\\n",
       "0         1           0  ...           0          0           0         0   \n",
       "1         0           0  ...           1          0           0         0   \n",
       "2         1           0  ...           0          0           0         0   \n",
       "3         1           0  ...           1          0           0         0   \n",
       "4         0           0  ...           1          0           0         0   \n",
       "\n",
       "   title_Master  title_Miss  title_Mr  title_Mrs  title_Ms  title_Rev  \n",
       "0             0           0         1          0         0          0  \n",
       "1             0           0         0          1         0          0  \n",
       "2             0           0         1          0         0          0  \n",
       "3             0           0         1          0         0          0  \n",
       "4             0           0         0          1         0          0  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_val_dummy.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_val_dummy_list = data_val_dummy.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pclass',\n",
       " 'sibsp',\n",
       " 'parch',\n",
       " 'age',\n",
       " 'fare',\n",
       " 'family_size',\n",
       " 'single',\n",
       " 'sex_female',\n",
       " 'sex_male',\n",
       " 'embarked_C',\n",
       " 'embarked_Q',\n",
       " 'embarked_S',\n",
       " 'title_Col',\n",
       " 'title_Dona',\n",
       " 'title_Dr',\n",
       " 'title_Master',\n",
       " 'title_Miss',\n",
       " 'title_Mr',\n",
       " 'title_Mrs',\n",
       " 'title_Ms',\n",
       " 'title_Rev']"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_val_dummy_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_val = rf2.predict(data_val_dummy[[\n",
    "                                         'pclass',\n",
    "                                         'age',\n",
    "                                         'fare',\n",
    "                                         'family_size',\n",
    "                                         'single',\n",
    "                                         'sex_female',\n",
    "                                         'sex_male',\n",
    "                                         'embarked_C',\n",
    "                                         'embarked_Q',\n",
    "                                         'embarked_S',\n",
    "                                         'title_Col',\n",
    "                                         'title_Dona',\n",
    "                                         'title_Dr',\n",
    "                                         'title_Master',\n",
    "                                         'title_Miss',\n",
    "                                         'title_Mr',\n",
    "                                         'title_Mrs',]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_val_df = pd.DataFrame(pred_val, columns=['survived'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   survived\n",
       "0         0\n",
       "1         0\n",
       "2         0\n",
       "3         1\n",
       "4         0\n",
       "5         1\n",
       "6         0\n",
       "7         1\n",
       "8         1\n",
       "9         0"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_val_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
